# penstock release to monthly fractions

# Convert each plant's daily penstock release into monthly fractions of the
# annual total.
#
# For every unique RHPID in `target_plants_mapped_to_water`, reads
# "./data/internals/daily_release_selected_and_partitioned/<RHPID>.parquet"
# (with any "/" in the RHPID replaced by "-"), averages `penstock` by year and
# month, and divides each monthly mean by the sum of that year's monthly means.
#
# Args:
#   target_plants_mapped_to_water: table with an `RHPID` column.
#   daily_penstock_release: not used in the body. NOTE(review): presumably an
#     upstream pipeline dependency that forces this step to re-run when the
#     daily release files change -- confirm against the caller.
#
# Returns: a table with one row per plant, year and month and the columns
#   RHPID, year, month, fraction, total_from.
penstock_release_to_monthly_fractions <- function(target_plants_mapped_to_water,
                                                  daily_penstock_release) {

  release_dir <- "./data/internals/daily_release_selected_and_partitioned/"

  plant_ids <- target_plants_mapped_to_water |>
    pull(RHPID) |>
    unique()

  all_dams_fractions <- map_dfr(plant_ids, function(plant_id) {

    # "/" in a plant ID is swapped for "-" to form the file name
    file_stem <- gsub("/", "-", plant_id, fixed = TRUE)

    read_parquet(paste0(release_dir, file_stem, ".parquet")) |>
      mutate(year = year(date),
             month = month(date, label = TRUE)) |>
      # total_from is taken from the first record of each year-month
      summarise(mean_penstock_release = mean(penstock, na.rm = TRUE),
                total_from = first(total_from),
                .by = c(year, month)) |>
      # the yearly sum is taken without na.rm, so a year containing a missing
      # monthly mean yields missing fractions for every month of that year
      mutate(fraction = mean_penstock_release / sum(mean_penstock_release),
             .by = year) |>
      # `!!` injects the loop value rather than looking up a column
      mutate(RHPID = !!plant_id) |>
      select(RHPID, year, month, fraction, total_from)

  })

  all_dams_fractions

}


# Build monthly generation fractions for each plant and year by copying the
# observed monthly pattern of the most similar *other* year at the same plant.
#
# Observed monthly generation is converted to fractions of the annual total.
# For each year in `reanalysis_years`, the donor is the year (other than the
# target year itself) that has non-missing observed fractions and whose annual
# `flow_BCM` is closest to the target year's.
#
# Args:
#   annual_energy_CF_water: table with columns RHPID, year, flow_BCM.
#   gen_1980_2019_monthly_obs_by_RHPID: wide table with RHPID, year, source
#     and one generation column per month; month column names are matched
#     against `month.abb`.
#
# Returns: the (RHPID, year, month) rows of the observed table for the
#   requested years, with `fraction` taken from the donor year. `fraction` is
#   NA when no donor year is available.
#
# NOTE: `reanalysis_years` is not an argument; it is read from the enclosing
# environment and must be defined before this function is called.
nearest_yr_gen_to_monthly_fractions <- function(annual_energy_CF_water,
                                                gen_1980_2019_monthly_obs_by_RHPID) {

  plant_tables <- gen_1980_2019_monthly_obs_by_RHPID |>
    select(-source) |>
    split(~RHPID)

  all_dams_fractions <- map_dfr(plant_tables, function(monthly_energy_obs) {

    # a year containing any missing month gets NA fractions throughout,
    # because the annual sum is taken without na.rm
    fractions_real <- monthly_energy_obs |>
      pivot_longer(-c(year, RHPID), values_to = "gen_MWh", names_to = "month") |>
      mutate(month = factor(month, levels = month.abb, ordered = TRUE)) |>
      mutate(fraction = gen_MWh / sum(gen_MWh), .by = year) |>
      select(-gen_MWh)

    # years that can act as donors
    fractions_copy_yrs <- fractions_real |>
      filter(!is.na(fraction))

    # The plant's annual flows and the donor subset do not depend on the
    # target year, so they are computed once per plant rather than once per
    # reanalysis year.
    dam_id <- fractions_real[["RHPID"]][1]

    annual_flow_dam <- annual_energy_CF_water |>
      filter(RHPID == !!dam_id) |>
      select(year, flow_BCM)

    donor_flows <- annual_flow_dam |>
      filter(year %in% fractions_copy_yrs[["year"]])

    fractions_analogue_dam <- map_dfr(reanalysis_years, function(yr) {

      target_flow <- (annual_flow_dam |> filter(year == yr))[["flow_BCM"]]

      # donor years ordered from most to least similar annual flow
      yrs_ordered <- donor_flows |>
        filter(year != yr) |>
        mutate(diff = abs(flow_BCM - target_flow)) |>
        arrange(diff) |>
        pull(year)

      fractions_real |>
        select(-fraction) |>
        filter(year == yr) |>
        left_join(fractions_copy_yrs |>
                    filter(year == yrs_ordered[1]) |>
                    select(-year), by = c("month", "RHPID"))

    })

    fractions_analogue_dam

  })

  all_dams_fractions

}


# Disaggregate annual generation to monthly values using monthly fractions.
#
# Steps:
#   1. Join flow-based and supplementary monthly fractions; negative
#      supplementary fractions are set to zero.
#   2. For plants that have at least one row with both fractions missing,
#      fill every year lacking a flow fraction by copying the flow fractions
#      of the year with the closest annual `flow_BCM` (from
#      `annual_CF_models`) and label those rows "Fractions copied".
#   3. Compute annual counterfactual generation as the modeled capacity
#      factor (clamped to [0, 1]) times the plant's largest `cap_MWh` in
#      2020-2022.
#   4. Multiply annual actual and counterfactual generation by both sets of
#      fractions and attach observed monthly generation.
#
# Args:
#   monthly_flow_fractions: RHPID, year, month, fraction, total_from.
#   monthly_supp_fractions: RHPID, year, month, fraction.
#   gen_1980_2019_monthly_obs_by_RHPID: wide observed monthly generation with
#     RHPID, year, source and one column per month.
#   annual_energy_CF_water: RHPID, year, gen_MWh, cap_MWh.
#   annual_CF_models: RHPID, year, flow_BCM, choice, CF_Lmod, CF_Xmod.
#
# Returns: a monthly table with RHPID, year, month, ActGen_MWh_flow,
#   ActGen_MWh_supp, CCfGen_MWh_flow, CCfGen_MWh_supp, the remaining columns
#   of `flow_priority`, ObsGen_MWh and the helper column `mean_`.
#
# NOTE: `flow_priority` is not an argument; it is read from the enclosing
# environment and must provide `data_from` and `rank` columns.
disaggregate_gen <- function(monthly_flow_fractions,
                             monthly_supp_fractions,
                             gen_1980_2019_monthly_obs_by_RHPID,
                             annual_energy_CF_water,
                             annual_CF_models) {

  monthly_fractions <- monthly_flow_fractions |>
    rename(flow_fraction = fraction) |>
    left_join(monthly_supp_fractions,
              by = join_by(RHPID, year, month)) |>
    rename(supp_fraction = fraction) |>
    mutate(supp_fraction = if_else(supp_fraction < 0, 0, supp_fraction))

  # deal with residual NAs and NaNs
  monthly_fractions_filled <- monthly_fractions |>
    split(~RHPID) |>
    map_dfr(function(monthly_fractions_dam) {

      NA_cases <- monthly_fractions_dam |>
        filter(is.na(flow_fraction) & is.na(supp_fraction))

      # nothing to fill for this plant
      if (nrow(NA_cases) == 0) {
        return(monthly_fractions_dam)
      }

      dam_id <- monthly_fractions_dam[["RHPID"]][1]

      annual_flow_vols <- annual_CF_models |>
        filter(RHPID == !!dam_id) |>
        select(year, flow_BCM)

      missing_fraction_years <- monthly_fractions_dam |>
        filter(is.na(flow_fraction)) |>
        pull(year) |>
        unique()

      # just the years with fractions
      annual_flow_vols_ <- annual_flow_vols |>
        filter(!year %in% missing_fraction_years)

      # for each year lacking fractions, pick the year with the closest flow
      years_to_copy <- map_dfr(missing_fraction_years, function(missing_yr) {

        missing_yr_flow <- annual_flow_vols |>
          filter(year == missing_yr) |>
          pull(flow_BCM)

        year_to_replicate <- (annual_flow_vols_ |>
                                mutate(diff = abs(missing_yr_flow - flow_BCM)) |>
                                arrange(diff) |>
                                pull(year))[1]

        tibble(year = missing_yr,
               copy = year_to_replicate)

      })

      # rows lacking a flow fraction receive the donor year's flow fraction;
      # rows that already have one are appended unchanged
      fractions_with_copies <- monthly_fractions_dam |>
        mutate(year = as.integer(year)) |>
        filter(is.na(flow_fraction)) |>
        select(-flow_fraction) |>
        left_join(years_to_copy, by = "year") |>
        select(-total_from) |>
        left_join(
          monthly_fractions_dam |>
            select(-supp_fraction) |>
            filter(year %in% years_to_copy[["copy"]]) |>
            unique(),
          by = c("RHPID", "copy" = "year", "month")
        ) |>
        mutate(total_from = "Fractions copied") |>
        select(-copy) |>
        bind_rows(
          monthly_fractions_dam |>
            mutate(year = as.integer(year)) |>
            filter(!is.na(flow_fraction))
        ) |>
        arrange(year, month)

      fractions_with_copies

    })

  annual_CF_modeled <- annual_CF_models |>
    mutate(modeled_CF = if_else(choice == "L",
                                CF_Lmod,
                                CF_Xmod)) |>
    select(RHPID, year, modeled_CF)

  # max(..., na.rm = TRUE) returns -Inf (with a warning) when every value is
  # NA; return NA instead so the counterfactual generation is missing rather
  # than -Inf or NaN.
  current_capacity <- annual_energy_CF_water |>
    filter(year %in% 2020:2022) |>
    summarise(
      capacity_MWh = if (all(is.na(cap_MWh))) {
        NA_real_
      } else {
        max(cap_MWh, na.rm = TRUE)
      },
      .by = "RHPID"
    )

  annual_gen_CCF <- annual_CF_modeled |>
    left_join(current_capacity, by = join_by(RHPID)) |>
    # Annual climatological counter-factual generation
    mutate(modeled_CF = if_else(modeled_CF > 1, 1, modeled_CF),
           modeled_CF = if_else(modeled_CF < 0, 0, modeled_CF)) |>
    mutate(gen_MWh_CCF = modeled_CF * capacity_MWh) |>
    select(RHPID, year, gen_MWh_CCF)

  monthly_fractions_with_annual_generation <- monthly_fractions_filled |>
    left_join(
      annual_energy_CF_water |>
        select(year, RHPID, gen_MWh_annual = gen_MWh),
      by = join_by(RHPID, year)
    ) |>
    left_join(annual_gen_CCF, by = join_by(RHPID, year)) |>
    # remove three cases lacking generation / CF data 1980:2019
    filter(!(RHPID %in% c("58434_RED ROCK",
                          "58645_LAKE LIVINGSTON",
                          "8225_SENECA")))

  ObsGen <- gen_1980_2019_monthly_obs_by_RHPID |>
    select(-source) |>
    pivot_longer(-c(year, RHPID), names_to = "month", values_to = "ObsGen_MWh") |>
    split(~RHPID) |>
    map_dfr(function(dam_table) {

      # allow an na.approx of just two months (very rare)
      # NOTE(review): interpolation follows row order, so this presumably
      # relies on the input being sorted by year with month columns in
      # calendar order -- confirm.
      dam_table_gap_filled <- dam_table |>
        mutate(ObsGen_MWh = na.approx(ObsGen_MWh, maxgap = 2,
                                      na.rm = FALSE))

      # A year that still contains an NA has an NA mean, so the whole year is
      # set to NA. The helper column `mean_` is left in the output.
      dam_table_gap_filled |>
        mutate(mean_ = mean(ObsGen_MWh), .by = year) |>
        mutate(ObsGen_MWh = if_else(is.na(mean_),
                                    NA_real_,
                                    ObsGen_MWh))

    })

  monthly_ActGen_CCfGen_ObsGen_1980_2019 <-
    monthly_fractions_with_annual_generation |>
    mutate(ActGen_MWh_flow = gen_MWh_annual * flow_fraction,
           ActGen_MWh_supp = gen_MWh_annual * supp_fraction,
           CCfGen_MWh_flow = gen_MWh_CCF * flow_fraction,
           CCfGen_MWh_supp = gen_MWh_CCF * supp_fraction) |>
    select(RHPID, year, month, total_from,
           ActGen_MWh_flow, ActGen_MWh_supp, CCfGen_MWh_flow, CCfGen_MWh_supp) |>
    left_join(flow_priority, by = c("total_from" = "data_from")) |>
    select(-total_from, -rank) |>
    left_join(ObsGen, by = join_by(RHPID, year, month))

  monthly_ActGen_CCfGen_ObsGen_1980_2019

}

# Select the final monthly generation per plant and year, write the
# RectifHydPlus CSV outputs, and return the three products stacked.
#
# Steps:
#   1. Derive nameplate capacity (MW) from annual `cap_MWh` and the hours in
#      each year of 1980-2019.
#   2. Capacity check: if any month of a plant-year has flow-based actual
#      generation above 1.25 x nameplate x hours in that month, the flow-based
#      values for that plant-year are set to NA so the supplementary
#      disaggregation is used instead.
#   3. Build three wide tables (one row per plant, year and quality label;
#      one column per month): actual generation with observed backfill,
#      actual generation without backfill, and the climatological
#      counterfactual.
#   4. Write those tables and a plant ID reference table under "outputs/".
#
# Args:
#   target_plants_mapped_to_water: RHPID, nidid, EIA_ID, COMPLXID, grand_id,
#     COMID.
#   annual_energy_CF_water: RHPID, year, cap_MWh.
#   monthly_ActGen_CCfGen_ObsGen_1980_2019: monthly table with RHPID, year,
#     month, quality_label, ObsGen_MWh and the ActGen_* / CCfGen_* columns.
#   QFER_gen: table with CECPlantID and EIAPlantID.
#
# Returns: the three wide tables bound by row, with a `data` column
#   identifying the product.
select_final_gen <- function(target_plants_mapped_to_water,
                             annual_energy_CF_water,
                             monthly_ActGen_CCfGen_ObsGen_1980_2019,
                             QFER_gen) {

  # quality labels used by more than one product
  label_observed <- "X. Observed Monthly Net Generation (EIA-923_M / CEC-QDEF)"
  label_no_data <- "N/A - No EIA annual survey data to disaggregate"
  label_copied <- "F2. Weakest proxy: Monthly pattern copied from similar year at plant"

  # generate table of hrs per year for computation of maximum output
  date_table <- tibble(
    date = seq.Date(from = ymd("1980-01-01"), to = ymd("2019-12-31"), by = 1)
  ) |>
    mutate(year = as.integer(year(date)), month = month(date, label = TRUE))

  hrs_per_year <- date_table |>
    summarise(n_hrs = n() * 24, .by = year)

  hrs_per_month_and_year <- date_table |>
    summarise(n_hrs = n() * 24, .by = c(month, year))

  # non-positive nameplates are treated as unknown
  nameplates <- annual_energy_CF_water |>
    select(RHPID, year, cap_MWh) |>
    left_join(hrs_per_year, by = join_by(year)) |>
    mutate(nameplate_MW = cap_MWh / n_hrs) |>
    mutate(nameplate_MW = if_else(nameplate_MW <= 0, NA_real_, nameplate_MW)) |>
    select(RHPID, year, nameplate_MW)

  # plants in nidid order; the ordered factor drives output row order
  RHPID_ordered <- target_plants_mapped_to_water |>
    arrange(nidid) |>
    select(RHPID) |>
    unique() |>
    mutate(RHPID_ = factor(RHPID, levels = RHPID, ordered = TRUE))

  # Shared final step: round, spread months into columns, order rows by plant
  # and fix the column order.
  to_wide_by_plant <- function(monthly_long) {
    monthly_long |>
      select(RHPID, gen_MWh, quality_label, year, month) |>
      mutate(gen_MWh = round(gen_MWh, 0)) |>
      pivot_wider(values_from = gen_MWh, names_from = month) |>
      left_join(RHPID_ordered, by = join_by(RHPID)) |>
      arrange(RHPID_) |>
      select(-RHPID_) |>
      mutate(year = as.integer(year)) |>
      select(RHPID, year, Jan, Feb, Mar, Apr, May, Jun,
             Jul, Aug, Sep, Oct, Nov, Dec, quality_label)
  }

  # Capacity exceedance check: test each plant-year for a violation. If
  # present, replace flow-based disaggregation with the supplementary.
  monthly_ActGen_CCfGen_ObsGen_1980_2019_CAPFIX <-
    monthly_ActGen_CCfGen_ObsGen_1980_2019 |>
    left_join(nameplates, by = join_by(RHPID, year)) |>
    left_join(hrs_per_month_and_year, by = join_by(year, month)) |>
    mutate(RHPID_year = paste0(RHPID, "-", year)) |>
    split(~RHPID_year) |>
    map_dfr(function(plant_year) {

      # without a nameplate, flow-based values or a supplementary fallback
      # the check cannot be applied; leave the plant-year untouched
      cannot_check <- anyNA(plant_year[["nameplate_MW"]]) ||
        anyNA(plant_year[["ActGen_MWh_flow"]]) ||
        anyNA(plant_year[["CCfGen_MWh_supp"]])
      if (cannot_check) {
        return(plant_year)
      }

      # allow for 25% overshoot
      max_gen_by_cap <- 1.25 * (plant_year[["nameplate_MW"]] * plant_year[["n_hrs"]])
      diffs <- max_gen_by_cap - plant_year[["ActGen_MWh_flow"]]

      if (!any(diffs < 0)) {
        return(plant_year)
      }

      message("CAPACITY FIX: ", plant_year[["RHPID_year"]][1])

      plant_year |>
        mutate(ActGen_MWh_flow = NA_real_,
               CCfGen_MWh_flow = NA_real_,
               quality_label = "CAPACITY_FIX")

    })

  # actual generation with EIA backfill
  RectifHydPlus_actual_backfilled <-
    monthly_ActGen_CCfGen_ObsGen_1980_2019_CAPFIX |>
    mutate(quality_label = case_when(
      !is.na(ObsGen_MWh) ~ label_observed,
      is.na(ActGen_MWh_flow) & is.na(ActGen_MWh_supp) & is.na(ObsGen_MWh) ~
        label_no_data,
      is.na(ActGen_MWh_flow) & is.na(ObsGen_MWh) & !is.na(ActGen_MWh_supp) ~
        label_copied,
      TRUE ~ quality_label
    )) |>
    mutate(
      gen_MWh = case_when(
        quality_label == label_observed ~ ObsGen_MWh,
        !is.na(ActGen_MWh_flow) ~ ActGen_MWh_flow,
        TRUE ~ ActGen_MWh_supp
      )
    ) |>
    to_wide_by_plant()

  # actual generation no backfill
  RectifHydPlus_actual_xBackfill <-
    monthly_ActGen_CCfGen_ObsGen_1980_2019_CAPFIX |>
    mutate(quality_label = case_when(
      is.na(ActGen_MWh_flow) & is.na(ActGen_MWh_supp) & is.na(ObsGen_MWh) ~
        label_no_data,
      is.na(ActGen_MWh_flow) & !is.na(ActGen_MWh_supp) ~
        label_copied,
      TRUE ~ quality_label
    )) |>
    mutate(
      gen_MWh = case_when(
        !is.na(ActGen_MWh_flow) ~ ActGen_MWh_flow,
        TRUE ~ ActGen_MWh_supp
      )
    ) |>
    to_wide_by_plant()

  # climatology counterfactual case
  RectifHydPlus_HydrologicalControl <-
    monthly_ActGen_CCfGen_ObsGen_1980_2019_CAPFIX |>
    mutate(quality_label = case_when(
      is.na(CCfGen_MWh_flow) & !is.na(CCfGen_MWh_supp) ~
        label_copied,
      TRUE ~ quality_label
    )) |>
    mutate(
      gen_MWh = case_when(
        !is.na(CCfGen_MWh_flow) ~ CCfGen_MWh_flow,
        TRUE ~ CCfGen_MWh_supp
      )
    ) |>
    to_wide_by_plant()

  # create RHP reference table
  CEC_EIA <- QFER_gen |>
    select(CEC = CECPlantID, EIA = EIAPlantID) |>
    unique() |>
    mutate(EIA = as.integer(EIA))

  RHP_reference_table <- target_plants_mapped_to_water |>
    mutate(EIA_ID = as.integer(EIA_ID), grand_id = as.integer(grand_id)) |>
    select(RHPID, EIA = EIA_ID, COMPLX = COMPLXID, NID = nidid, GRAND = grand_id, COMID) |>
    left_join(CEC_EIA, by = join_by(EIA))

  if (!dir.exists("outputs")) {
    dir.create("outputs/")
  }
  if (!dir.exists("outputs/RectifHydPlus_supplemental/")) {
    dir.create("outputs/RectifHydPlus_supplemental/")
  }
  if (!dir.exists("outputs/RectifHydPlus_misc/")) {
    dir.create("outputs/RectifHydPlus_misc/")
  }

  # write RectifHydPlus to file
  write_csv(RectifHydPlus_actual_backfilled, "outputs/RectifHydPlus_NetGen_MWh_v1.1.csv")
  write_csv(RectifHydPlus_actual_xBackfill, "outputs/RectifHydPlus_supplemental/RectifHydPlus_NetGen_MWh_v1.1_NoObservations.csv")
  write_csv(RectifHydPlus_HydrologicalControl, "outputs/RectifHydPlus_supplemental/RectifHydPlus_NetGen_MWh_v1.1_HydrologicalControl.csv")
  write_csv(RHP_reference_table, "outputs/RectifHydPlus_misc/RectifHydPlus_Reference-table.csv")

  bind_rows(
    RectifHydPlus_actual_xBackfill |> mutate(data = "HISTORICAL excluding EIA backfill"),
    RectifHydPlus_actual_backfilled |> mutate(data = "HISTORICAL with EIA/CEC backfill"),
    RectifHydPlus_HydrologicalControl |> mutate(data = "Hydological Control")
  )

}
